##########
#Quality of Legislation and Compliance: A Natural Language Processing Approach
#Moritz Osnabruegge, Matia Vannoni
#This script produces Figures 1-2, Figures A1-A5 and Table A4
#########



library(corrplot)
library(dplyr)
library(ggplot2)
library(grid)
library(vtable)


# Load the analysis data set from the working directory. Every section below
# reads its columns (e.g. celex, implementation, vagueness,
# syntactic_complexity) from this data frame.
data <- read.csv(file = "data.csv", encoding = "UTF-8")



######################  
###FIGURE 1
######################

# Write one density figure to `path`: the kernel density curve of `values`,
# a dashed grey vertical line at their mean, and a rug of the observations.
# Anything passed through `...` is forwarded to density().
save_density_pdf <- function(path, values, axis_label, ...) {
  pdf(path, width = 7.5, height = 5)
  par(mar = c(5, 5, 1, 2))
  plot(
    density(values, ...),
    ylab = "Density", xlab = axis_label, main = NA,
    cex.lab = 1.7, cex.axis = 1.7
  )
  abline(v = mean(values), col = "darkgrey", lty = 2)
  rug(values)
  dev.off()
}

# Panel a: the density estimate is evaluated from 0 upwards (from = 0).
save_density_pdf(
  "figure_1a.pdf", data$syntactic_complexity, "Syntactic Complexity",
  from = 0
)

# Panel b: default evaluation range.
save_density_pdf("figure_1b.pdf", data$vagueness, "Vagueness")



######################  
###FIGURE 2
######################

# Build one row per directive (celex):
#   compliance = sum of `implementation` over the directive's rows
#   total      = number of rows for the directive
#   compliance_percent = compliance / total * 100
# and plot it against each text measure.

# Constant column of ones. It is kept on `data` because the Figure A4 and
# Figure A5 sections further down sum it to count rows.
data$x <- 1

r_counts <- data %>%
  group_by(celex) %>%
  summarise(compliance = sum(implementation), total = n())

# Distinct (celex, vagueness, syntactic_complexity) combinations.
r_measures <- unique(
  subset(data, select = c("celex", "vagueness", "syntactic_complexity"))
)

r <- merge(r_counts, r_measures, by = "celex")
r$compliance_percent <- (r$compliance / r$total) * 100

# Theme shared by both panels.
fig2_theme <- theme_bw() +
  theme(
    axis.line = element_line(color = "black"),
    axis.text.x = element_text(size = 16, colour = "black"),
    axis.text.y = element_text(size = 16, colour = "black", hjust = 0),
    axis.title.y = element_text(size = 16),
    axis.title.x = element_text(size = 16)
  )

# Text label in the upper-right area of the panel. The label is a fixed
# string supplied by the caller; it is NOT recomputed from `r`.
fig2_corr_label <- function(label) {
  annotation_custom(
    grobTree(
      textGrob(
        label, x = 0.8, y = 0.9, hjust = 0,
        gp = gpar(col = "black", fontsize = 12)
      )
    )
  )
}

#Panel a
# NOTE(review): limits = c(3, 11) is a hard scale limit; ggplot2 drops points
# outside it (with a warning) before fitting the lm line -- confirm that no
# directive falls outside this range.
p <- ggplot(r, aes(x = syntactic_complexity, y = compliance_percent)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  scale_x_continuous(
    name = "Syntactic Complexity",
    limits = c(3, 11), breaks = seq(3, 11, by = 1)
  ) +
  scale_y_continuous(name = "Percentage") +
  fig2_theme +
  fig2_corr_label("r=-0.37")

pdf("figure_2a.pdf", width = 4, height = 4)
print(p)
dev.off()


#Panel b
p <- ggplot(r, aes(x = vagueness, y = compliance_percent)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  scale_x_continuous(name = "Vagueness") +
  scale_y_continuous(name = "Percentage") +
  fig2_theme +
  fig2_corr_label("r=-0.45")

pdf("figure_2b.pdf", width = 4, height = 4)
print(p)
dev.off()



######################  
###FIGURE A1
######################

# Correlation matrix of the text measures for the corrplot figure.
# nr_words and syntactic_complexity are coerced to numeric first; the columns
# are then picked in display order and given their display names.
data_cor <- data %>%
  mutate(
    nr_words = as.numeric(nr_words),
    syntactic_complexity = as.numeric(syntactic_complexity)
  ) %>%
  dplyr::select(
    `Modifiers/Words` = syntactic_complexity,
    `Vagueness` = vagueness,
    `Number of Words` = nr_words,
    `Number of Provisions` = number_of_provisions,
    FRE,
    TTR
  ) %>%
  cor()

# Shaded correlation plot without the diagonal, variables in the order above.
pdf("figure_A1.pdf", width = 15, height = 10)
corrplot(
  data_cor,
  method = "shade", diag = FALSE, order = "original",
  cl.cex = 2, tl.cex = 2
)
dev.off()



######################  
###FIGURES A2 and A3
######################

# Distinct (celex, policy, syntactic_complexity, vagueness) combinations, so
# the bar heights below are means over these distinct rows per policy.
data_policy <- unique(
  subset(data, select = c("celex", "policy", "syntactic_complexity", "vagueness"))
)

# Layers common to both figures: mean of y per policy drawn as horizontal
# bars, empty axis titles, large axis text.
policy_bar_layers <- list(
  stat_summary(fun = "mean", geom = "bar"),
  coord_flip(),
  xlab(""),
  ylab(""),
  theme_bw(),
  theme(axis.text = element_text(size = 40))
)


#Figure A2
# NOTE(review): `fill` is not mapped in aes(), so this fill label presumably
# has no visible effect; it is kept unchanged.
fig_a2 <- ggplot(data_policy, aes(x = as.factor(policy), y = syntactic_complexity)) +
  labs(fill = "Syntactic Complexity") +
  policy_bar_layers

pdf("figure_A2.pdf", width = 15, height = 15)
print(fig_a2)
dev.off()


#Figure A3
fig_a3 <- ggplot(data_policy, aes(x = as.factor(policy), y = vagueness)) +
  labs(fill = "Vagueness") +
  policy_bar_layers

pdf("figure_A3.pdf", width = 15, height = 15)
print(fig_a3)
dev.off()



######################  
###FIGURE A4
######################

# Same scatter plots as Figure 2, restricted to a fixed list of ten directives.
lc_celex <- c(
  "32000L0078", "32001L0005", "32001L0016", "32001L0029", "32001L0055",
  "32002L0007", "32000L0055", "32002L0021", "32000L0026", "32001L0110"
)
data_lc <- subset(data, celex %in% lc_celex)

# One row per directive: compliance = sum of `implementation`, total = number
# of rows. n() is used for the row count so this section does not rely on a
# helper column created in another section of the script.
r_counts_lc <- data_lc %>%
  group_by(celex) %>%
  summarise(compliance = sum(implementation), total = n())

# Distinct (celex, vagueness, syntactic_complexity) combinations.
r_measures_lc <- unique(
  subset(data_lc, select = c("celex", "vagueness", "syntactic_complexity"))
)

r_lc <- merge(r_counts_lc, r_measures_lc, by = "celex")
r_lc$compliance_percent <- (r_lc$compliance / r_lc$total) * 100

# Theme shared by both panels.
theme_lc <- theme_bw() +
  theme(
    axis.line = element_line(color = "black"),
    axis.text.x = element_text(size = 16, colour = "black"),
    axis.text.y = element_text(size = 16, colour = "black", hjust = 0),
    axis.title.y = element_text(size = 16),
    axis.title.x = element_text(size = 16)
  )

# Text label in the upper-right area of the panel. The label is a fixed
# string supplied by the caller; it is NOT recomputed from `r_lc`.
corr_label_lc <- function(label) {
  annotation_custom(
    grobTree(
      textGrob(
        label, x = 0.8, y = 0.9, hjust = 0,
        gp = gpar(col = "black", fontsize = 12)
      )
    )
  )
}

#Panel a
# NOTE(review): limits = c(3, 11) is a hard scale limit; ggplot2 drops points
# outside it (with a warning) before fitting the lm line.
p <- ggplot(r_lc, aes(x = syntactic_complexity, y = compliance_percent)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  scale_x_continuous(
    name = "Syntactic Complexity",
    limits = c(3, 11), breaks = seq(3, 11, by = 1)
  ) +
  scale_y_continuous(name = "Percentage") +
  theme_lc +
  corr_label_lc("r=-0.43")

pdf("figure_A4a.pdf", width = 4, height = 4)
print(p)
dev.off()


#Panel b
p <- ggplot(r_lc, aes(x = vagueness, y = compliance_percent)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  scale_x_continuous(name = "Vagueness") +
  scale_y_continuous(name = "Percentage") +
  theme_lc +
  corr_label_lc("r=-0.53")

pdf("figure_A4b.pdf", width = 4, height = 4)
print(p)
dev.off()



######################  
###FIGURE A5
######################

# Same scatter plots as Figure 2, restricted to a fixed list of eleven
# directives.
hc_celex <- c(
  "32003L0049", "31999L0044", "31999L0093", "32000L0046", "32001L0112",
  "31999L0105", "32000L0036", "32000L0031", "32001L0084", "32001L0037",
  "31999L0074"
)
data_hc <- subset(data, celex %in% hc_celex)

# One row per directive: compliance = sum of `implementation`, total = number
# of rows. n() is used for the row count so this section does not rely on a
# helper column created in another section of the script.
r_counts_hc <- data_hc %>%
  group_by(celex) %>%
  summarise(compliance = sum(implementation), total = n())

# Distinct (celex, vagueness, syntactic_complexity) combinations.
r_measures_hc <- unique(
  subset(data_hc, select = c("celex", "vagueness", "syntactic_complexity"))
)

r_hc <- merge(r_counts_hc, r_measures_hc, by = "celex")
r_hc$compliance_percent <- (r_hc$compliance / r_hc$total) * 100

# Theme shared by both panels.
theme_hc <- theme_bw() +
  theme(
    axis.line = element_line(color = "black"),
    axis.text.x = element_text(size = 16, colour = "black"),
    axis.text.y = element_text(size = 16, colour = "black", hjust = 0),
    axis.title.y = element_text(size = 16),
    axis.title.x = element_text(size = 16)
  )

# Text label in the upper-right area of the panel. The label is a fixed
# string supplied by the caller; it is NOT recomputed from `r_hc`.
corr_label_hc <- function(label) {
  annotation_custom(
    grobTree(
      textGrob(
        label, x = 0.8, y = 0.9, hjust = 0,
        gp = gpar(col = "black", fontsize = 12)
      )
    )
  )
}

#Panel a
# NOTE(review): limits = c(3, 11) is a hard scale limit; ggplot2 drops points
# outside it (with a warning) before fitting the lm line.
p <- ggplot(r_hc, aes(x = syntactic_complexity, y = compliance_percent)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  scale_x_continuous(
    name = "Syntactic Complexity",
    limits = c(3, 11), breaks = seq(3, 11, by = 1)
  ) +
  scale_y_continuous(name = "Percentage") +
  theme_hc +
  corr_label_hc("r=-0.31")

pdf("figure_A5a.pdf", width = 4, height = 4)
print(p)
dev.off()

#Panel b
p <- ggplot(r_hc, aes(x = vagueness, y = compliance_percent)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  scale_x_continuous(name = "Vagueness") +
  scale_y_continuous(name = "Percentage") +
  theme_hc +
  corr_label_hc("r=-0.34")

pdf("figure_A5b.pdf", width = 4, height = 4)
print(p)
dev.off()



######################  
###Table A4
######################


# Variables reported in the summary-statistics table, in table order, each
# mapped to the label printed for it (names = column in `data`,
# values = display label).
table_a4_labels <- c(
  implementation = "Implementation",
  syntactic_complexity = "Syntactic Complexity",
  vagueness = "Vagueness",
  efficient_score = "Bureaucratic Performance",
  interest = "Interest Group Pluralism",
  typodir = "Directive Type",
  nr_words = "Number of Words",
  amending = "Amending Legislation",
  msdis_sum = "Member State's Disagreement",
  couconf_sum = "Diversity of Member States' Interests",
  govchange = "Government Change",
  discretion.index_ms = "Discretion Index",
  delegationratio = "Delegation Ratio"
)

# Two-column lookup handed to st(): variable name, then its label.
labs <- data.frame(
  name1 = names(table_a4_labels),
  name2 = unname(table_a4_labels)
)

# Write the summary-statistics table as LaTeX to the file "Table_A4".
st(
  data,
  vars = names(table_a4_labels),
  out = "latex",
  file = "Table_A4",
  labels = labs
)